#coding:utf8

from pyspark.sql import *
import os
from pyspark.sql.types import *
import pandas as pd

if __name__ == '__main__':
    # Build a local SparkSession for the DataFrame-creation / DSL query demo.
    spark = SparkSession.builder.appName("test1_dataFrame_create")\
        .master("local[*]").getOrCreate()
    sc = spark.sparkContext

    # Read the CSV with an explicit DDL schema string (id/subject/score).
    # NOTE(review): assumes the file has no header row — Spark's CSV default.
    localhost_path = 'file://' + os.getcwd() + '/../data/input/sql/stu_score.txt'
    df = spark.read.csv(localhost_path, "id INT,subject string,score int")
    # df.printSchema()
    # df.show()

    id_column = df['id']
    subject_column = df['subject']

    # DSL-style selection: select() accepts name strings, lists, and
    # Column objects interchangeably — all four calls are equivalent.
    df.select(["id", "subject"]).show()
    df.select('id', 'subject').show()
    df.select(id_column, subject_column).show()
    df.select([id_column, subject_column]).show()

    # filter() is an alias of where(); both accept a SQL predicate string
    # or a Column boolean expression.
    df.filter("score < 99").show()
    df.filter(df['score'] < 99).show()

    # groupby() returns GroupedData, not a DataFrame, so it cannot be
    # shown directly — apply an aggregation such as count() first.
    # print(type(df.groupby("subject")))
    df.groupby("subject").count().show()
    df.groupby(df['subject']).count().show()

    # Release the session's cluster resources; the original script never
    # stopped the session, leaking the local Spark context.
    spark.stop()
























































